# setwd("C:/Users/lnaeh/Desktop/STAT 302/302 Project")
library(tidyverse)
## -- Attaching packages --------
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.0 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -----------------
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Introduction: The dataset we analyzed originated from the World Happiness Report in 2018. Happiness scores for each country is calculated based on a subset of factors including life expectancy, GDP per capita, social support, etc. We chose to analyze specific trends in this dataset by creating scatter and time series plots, linear models, and the implementation of ANOVA. Our goal is to identify trends in certain factors and understand the influences of happiness scores.
Data Frame Setup:
happyScore <- read_csv('happyScore.csv')
## Warning: Missing column names filled in: 'X4' [4], 'X5' [5], 'X6' [6], 'X7' [7],
## 'X8' [8], 'X9' [9], 'X10' [10]
## Parsed with column specification:
## cols(
## Country = col_character(),
## `Happiness score` = col_double(),
## Region = col_character(),
## X4 = col_logical(),
## X5 = col_logical(),
## X6 = col_logical(),
## X7 = col_logical(),
## X8 = col_logical(),
## X9 = col_logical(),
## X10 = col_logical()
## )
happyCategories <- read_csv('happyCategories.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Country name` = col_character()
## )
## See spec(...) for full column specifications.
happyCategories <-
happyCategories %>%
rename('Country' = 'Country name',
'Log_GDP_per_Capita' = 'Log GDP per capita',
'Life_Expectancy' = 'Healthy life expectancy at birth',
'Freedom' = 'Freedom to make life choices',
'Social_Support' = 'Social support',
'Trust' = 'Confidence in national government')
happyScore <-
happyScore %>%
rename('Happiness_Score' = 'Happiness score')
happyScore <-
happyScore %>%
select(Country, Happiness_Score, Region)
happyCategories2018 <-
happyCategories %>%
filter(Year == 2018)
happy <- inner_join(happyScore, happyCategories2018, by = 'Country')
head(happy, 10)
## # A tibble: 10 x 28
## Country Happiness_Score Region Year `Life Ladder` Log_GDP_per_Cap~
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Finland 7.77 Scand~ 2018 7.86 10.6
## 2 Denmark 7.6 Scand~ 2018 7.65 10.8
## 3 Norway 7.55 Scand~ 2018 7.44 11.1
## 4 Nether~ 7.49 Weste~ 2018 7.46 10.8
## 5 Switze~ 7.48 Weste~ 2018 7.51 11.0
## 6 Sweden 7.34 Scand~ 2018 7.37 10.8
## 7 New Ze~ 7.31 Pacif~ 2018 7.37 10.5
## 8 Canada 7.28 North~ 2018 7.18 10.7
## 9 Austria 7.25 Weste~ 2018 7.40 10.7
## 10 Austra~ 7.23 Pacif~ 2018 7.18 10.7
## # ... with 22 more variables: Social_Support <dbl>, Life_Expectancy <dbl>,
## # Freedom <dbl>, Generosity <dbl>, `Perceptions of corruption` <dbl>,
## # `Positive affect` <dbl>, `Negative affect` <dbl>, Trust <dbl>, `Democratic
## # Quality` <dbl>, `Delivery Quality` <dbl>, `Standard deviation of ladder by
## # country-year` <dbl>, `Standard deviation/Mean of ladder by
## # country-year` <dbl>, `GINI index (World Bank estimate)` <dbl>, `GINI index
## # (World Bank estimate), average 2000-16` <dbl>, `gini of household income
## # reported in Gallup, by wp5-year` <dbl>, `Most people can be trusted,
## # Gallup` <dbl>, `Most people can be trusted, WVS round 1981-1984` <dbl>,
## # `Most people can be trusted, WVS round 1989-1993` <dbl>, `Most people can
## # be trusted, WVS round 1994-1998` <dbl>, `Most people can be trusted, WVS
## # round 1999-2004` <dbl>, `Most people can be trusted, WVS round
## # 2005-2009` <dbl>, `Most people can be trusted, WVS round 2010-2014` <dbl>
tail(happy, 10)
## # A tibble: 10 x 28
## Country Happiness_Score Region Year `Life Ladder` Log_GDP_per_Cap~
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Madaga~ 3.93 Sub-S~ 2018 4.07 7.28
## 2 Burundi 3.78 Sub-S~ 2018 3.78 6.54
## 3 Zimbab~ 3.66 Sub-S~ 2018 3.62 7.55
## 4 Haiti 3.60 Carri~ 2018 3.61 7.42
## 5 Botswa~ 3.49 Sub-S~ 2018 3.46 9.68
## 6 Malawi 3.41 Sub-S~ 2018 3.33 7.01
## 7 Yemen 3.38 Middl~ 2018 3.06 NA
## 8 Rwanda 3.33 Sub-S~ 2018 3.56 7.57
## 9 Tanzan~ 3.23 Sub-S~ 2018 3.45 7.93
## 10 Afghan~ 3.20 South~ 2018 2.69 7.49
## # ... with 22 more variables: Social_Support <dbl>, Life_Expectancy <dbl>,
## # Freedom <dbl>, Generosity <dbl>, `Perceptions of corruption` <dbl>,
## # `Positive affect` <dbl>, `Negative affect` <dbl>, Trust <dbl>, `Democratic
## # Quality` <dbl>, `Delivery Quality` <dbl>, `Standard deviation of ladder by
## # country-year` <dbl>, `Standard deviation/Mean of ladder by
## # country-year` <dbl>, `GINI index (World Bank estimate)` <dbl>, `GINI index
## # (World Bank estimate), average 2000-16` <dbl>, `gini of household income
## # reported in Gallup, by wp5-year` <dbl>, `Most people can be trusted,
## # Gallup` <dbl>, `Most people can be trusted, WVS round 1981-1984` <dbl>,
## # `Most people can be trusted, WVS round 1989-1993` <dbl>, `Most people can
## # be trusted, WVS round 1994-1998` <dbl>, `Most people can be trusted, WVS
## # round 1999-2004` <dbl>, `Most people can be trusted, WVS round
## # 2005-2009` <dbl>, `Most people can be trusted, WVS round 2010-2014` <dbl>
# Scatter plots:
noNAHappy <-
happy %>%
drop_na(Life_Expectancy)
p1 <- ggplot() +
geom_point(data = noNAHappy, aes(x = Life_Expectancy, y = Happiness_Score, color = Region)) +
ggtitle('Life Expectancy vs. Happiness Score in 2018')
ggplotly(p1)
noNAHappy <-
noNAHappy %>%
drop_na(Log_GDP_per_Capita)
p2 <- ggplot() +
geom_point(data = noNAHappy, aes(x = Log_GDP_per_Capita, y = Happiness_Score, color = Region)) +
ggtitle('GDP per Capita vs. Happiness Score in 2018')
ggplotly(p2)
noNAHappy <-
noNAHappy %>%
drop_na(Social_Support)
p3 <- ggplot() +
geom_point(data = noNAHappy, aes(x = Social_Support, y = Happiness_Score, color = Region)) +
ggtitle('Social Support vs. Happiness Score in 2018')
ggplotly(p3)
# Next: Time series graph
tempTime <- inner_join(happyScore, happyCategories, by = 'Country')
happyTime1 <-
tempTime %>%
filter(Region == 'Sub-Saharan Africa') %>%
drop_na(Life_Expectancy, Log_GDP_per_Capita, Social_Support)
p4 <- ggplot(happyTime1, aes(y = Life_Expectancy, x = Year, color = Country)) +
geom_line() +
ggtitle('Life Expectancy from 2006-2019')
ggplotly(p4)
p5 <- ggplot(happyTime1, aes(y = Log_GDP_per_Capita, x = Year, color = Country)) +
geom_line() +
ggtitle('Log(GDP per Capita) from 2006-2019')
ggplotly(p5)
p6 <- ggplot(happyTime1, aes(y = Social_Support, x = Year, color = Country)) +
geom_line() +
ggtitle('Social Support from 2006-2019')
ggplotly(p6)
happyTime2 <-
tempTime %>%
filter(Region %in% c('Middle East', 'North Africa')) %>%
drop_na(Life_Expectancy, Log_GDP_per_Capita, Social_Support)
p7 <- ggplot(happyTime2, aes(y = Life_Expectancy, x = Year, color = Country)) +
geom_line() +
ggtitle('Life Expectancy from 2006-2019')
ggplotly(p7)
p8 <- ggplot(happyTime2, aes(y = Log_GDP_per_Capita, x = Year, color = Country)) +
geom_line() +
ggtitle('Log(GDP per Capita) from 2006-2019')
ggplotly(p8)
p9 <- ggplot(happyTime2, aes(y = Social_Support, x = Year, color = Country)) +
geom_line() +
ggtitle('Social Support from 2006-2019')
ggplotly(p9)
Summary: Evidently, the life expectancy time series in both Sub-Saharan Africa and the Middle East is displaying an increasing trend, whereas the GDP per Capita seems to be stagnant. On the other hand, social support in both regions seem to be flunctuating greatly, most likely due to the stability problems many countries are experiencing due to famine and civil wars.
We considered two regions in our ANOVA analysis: Europe and the Americas. We wanted to know if adding ‘country’ as a covariate in a linear model explaining several outcomes reduces the residual sum of squares compared to a model with just ‘region’ as a covariate. The outcomes we analyzed were life expectancy, GDP per capita, and social support from 2006 to 2018. When implementing an ANOVA model, we have to make several assumptions including no outliers, a normal distribution, and homogeneity of variance. Looking at the residuals vs. leverage plots, QQ plots, and scale-location plots for each model, we can determine if those assumptions hold in our analysis. None of the residuals vs leverage plots show significant leverage, meaning there are no outliers in our data for the countries in Europe and the Americas.
# Note: certain diagnostic plots are commented out for the sake of runtime and space. Please uncomment them if you want to take a look.
# Europe:
tempDF <- inner_join(happyScore, happyCategories, by = 'Country')
tempEurope <-
tempDF %>%
filter(Region %in% c('Scandinavia', 'Western Europe', 'Central and Eastern Europe'))
aEurope <- lm(Life_Expectancy ~ Region, data = tempEurope)
bEurope <- lm(Life_Expectancy ~ Region + Country, data = tempEurope)
anova(aEurope, bEurope)
## Analysis of Variance Table
##
## Model 1: Life_Expectancy ~ Region
## Model 2: Life_Expectancy ~ Region + Country
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 376 597.25
## 2 345 174.23 31 423.01 27.02 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot(aEurope)
#plot(bEurope)
#coef(aEurope)
cEurope <- lm(Log_GDP_per_Capita ~ Region, data = tempEurope)
dEurope <- lm(Log_GDP_per_Capita ~ Region + Country, data = tempEurope)
anova(cEurope, dEurope)
## Analysis of Variance Table
##
## Model 1: Log_GDP_per_Capita ~ Region
## Model 2: Log_GDP_per_Capita ~ Region + Country
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 373 42.289
## 2 342 1.930 31 40.359 230.73 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot(cEurope)
#plot(dEurope)
#coef(dEurope)
eEurope <- lm(Social_Support ~ Region, data = tempEurope)
fEurope <- lm(Social_Support ~ Region + Country, data = tempEurope)
anova(eEurope, fEurope)
## Analysis of Variance Table
##
## Model 1: Social_Support ~ Region
## Model 2: Social_Support ~ Region + Country
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 382 1.52509
## 2 350 0.42007 32 1.105 28.772 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot(eEurope)
#plot(fEurope)
#coef(fEurope)
# Americas:
tempAmericas <-
tempDF %>%
filter(Region %in% c('North America', 'Latin America'))
aAmericas <- lm(Life_Expectancy ~ Region, data = tempAmericas)
bAmericas <- lm(Life_Expectancy ~ Region + Country, data = tempAmericas)
anova(aAmericas, bAmericas)
## Analysis of Variance Table
##
## Model 1: Life_Expectancy ~ Region
## Model 2: Life_Expectancy ~ Region + Country
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 244 1302.96
## 2 227 151.54 17 1151.4 101.45 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot(aAmericas)
#plot(bAmericas)
#coef(bAmericas)
cAmericas <- lm(Log_GDP_per_Capita ~ Region, data = tempAmericas)
dAmericas <- lm(Log_GDP_per_Capita ~ Region + Country, data = tempAmericas)
anova(cAmericas, dAmericas)
## Analysis of Variance Table
##
## Model 1: Log_GDP_per_Capita ~ Region
## Model 2: Log_GDP_per_Capita ~ Region + Country
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 244 58.009
## 2 227 2.456 17 55.553 302.08 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot(cAmericas)
#plot(dAmericas)
#coef(dAmericas)
eAmericas <- lm(Social_Support ~ Region, data = tempAmericas)
fAmericas <- lm(Social_Support ~ Region + Country, data = tempAmericas)
anova(eAmericas, fAmericas)
## Analysis of Variance Table
##
## Model 1: Social_Support ~ Region
## Model 2: Social_Support ~ Region + Country
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 242 0.61081
## 2 225 0.18837 17 0.42245 29.682 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot(eAmericas)
#plot(fAmericas)
#coef(fAmericas)
Section 2 summary: The Normal Q-Q plots seem to be slightly skewed at the two ends, meaning that despite statistical significance holding, we need to be careful with our conclusion as the data is not normally distributed. The data in the residuals vs. fitted plots seem to be slightly off the horizontal. It is important to note that the data in the prognostic plots seem to be grouped, and therefore the variance of the comparison groups may not be reasonable, and thus causing a decrease in the models’ predictive powers. For the ANOVA models analyzing life expectancy and GDP per capita, the residual sum of squares decreased significantly in Europe as well as the Americas. Evidently, the differences in life expectancy and GDP per capita is explained by country affiliation/nationality. For these 4 models, the p-value was negligible, so the reduction in residual sum of squares is not due to randomness. For the ANOVA models analyzing social support, the residual sum of squares fell for the models including ‘country’ as a covariate, but not on the same magnitude as with the other two factors. The residual sum of squares was already extremely small, which implies that social support tends to be a strong indication of region. We still observed a statistically significant derease in the residual sum of squares; however, social support seems to be similar for governments in the same region. Note that data collection was missing for many nations in geographical regions such as Sub-Saharan Africa, Asia, and the Middle East. Therefore, the interpretation of an ANOVA model was only possible in Europe and the Americas, where missing data was not an issue.